#####################################################################################################
### TITLE: Create_Texas_PM25_Data.R
### PURPOSE: 
  ### This code extracts spatially-weighted averages of raster PM 2.5 data within Texas zip codes.
  ### The time period of the sample is 2000-2018, and the end product is a wide format data set.
### NOTE: This code uses rgdal, a deprecated package in R. In order to run this code, one must download R 4.1.3
### INPUTS: raw pm 2.5 data, shapefiles
### OUTPUTS: state48_pm25_2000_2018.dt, mean_pm25.csv 
####################################################################################################
# Set CRAN mirror explicitly
options(repos = c(CRAN = "https://cloud.r-project.org"))

# Install required packages if they are not already installed
packages <- c("tidyverse", "readstata13", "sp", "rgdal", "raster", "rgeos", "maptools", "exactextractr", "haven", "dplyr")
installed <- packages %in% rownames(installed.packages())
if (any(!installed)) {
  install.packages(packages[!installed])
}
if (interactive()) {
  if (requireNamespace("rstudioapi", quietly = TRUE) && rstudioapi::isAvailable()) {
    script_path <- rstudioapi::getActiveDocumentContext()$path
    base_dir <- dirname(script_path)
  } else {
    base_dir <- getwd()  # fallback
  }
} else {
  args <- commandArgs(trailingOnly = FALSE)
  script_path <- sub("--file=", "", args[grep("--file=", args)])
  base_dir <- dirname(normalizePath(script_path))
}

# Go 3 directories up
for (i in 1:2) {
  base_dir <- dirname(base_dir)
}

cat("Base directory is:", base_dir, "\n")

setwd(base_dir)

#Load necessary packages.
library(tidyverse)
library(readstata13)

library(sp)
library(rgdal)
library(raster)
library(rgeos)
library(maptools)
library(exactextractr)
library(haven)
library(dplyr)

cat("✅ R script is running\n")


#Bring in US zipcode shapefile (2019 vintage)
us_zcta <- shapefile("raw_data/Shapefiles/Zipcodes/cb_2019_us_zcta510_500k/cb_2019_us_zcta510_500k.shp")

#Bring in US state boundary shapefiles and filter for Texas
state_boundaries <- shapefile("raw_data/Shapefiles/State Boundaries/cb_2019_us_state_500k/cb_2019_us_state_500k.shp")
Texas_boundary <- state_boundaries[state_boundaries$STATEFP == "48", ]

#Keep only the zipcodes which at least partially intersect with Texas' state boundary 
crs(Texas_boundary) <- crs(us_zcta)
Texas_zcta <- raster::intersect(us_zcta, Texas_boundary)
Texas_zcta_df <- as.data.frame(Texas_zcta)

#Extract pollution data for Texas zipcode for the first year
#This will act as the base dataframe to which to attach data from other years 

#Bring in the pollution grid from 2000
pm25 <- maptools::readAsciiGrid("raw_data/Shapefiles/PM25/V4NA03_PM25_NA_200001_200012-RH35-NoNegs.asc/V4NA03_PM25_NA_200001_200012-RH35-NoNegs.asc")

#Equate the crs (Note that pm25 has no crs to begin with, so we use the tracts crs)
crs(pm25) <- crs(Texas_zcta)

#Coerce into a RasterLayer
pm25_r <- as(pm25, "RasterLayer")

#Extract a spatially weighted average of rasters in each zipcode polygon, and omitt NA values
#raster::extract(rasterlayer, polygons, fun = mean, exact = TRUE, weights = TRUE, normalizeWeights = TRUE, df=T, na.rm=T)
#na.rm = T will omit NA values but all Texas zipcodes will still get PM25 data
texas_pm25 <- raster::extract(pm25_r, Texas_zcta, fun = mean, exact = TRUE, weights = TRUE, normalizeWeights = TRUE, df = TRUE, na.rm = T)

#Rename pollution variable to include the year 200 index, and drop the ID variable
colnames(texas_pm25)[2] <- "PM25_2000"
texas_pm25 <- texas_pm25 %>%
  dplyr::select(-c(ID))

#Combine with the Texas zipcode data.
#Note that raster::extract returns values for polygons in the same order as they appear in the spdf
#So we only need to do cbind.
texas_pm25 <- cbind(Texas_zcta_df, texas_pm25)


#Loop through years 2001:2018 and attach to form wide format
for(y in 2001:2018){
  #Bring in the pollution grid
  pm25 <- maptools::readAsciiGrid(paste0("raw_data/Shapefiles/PM25/V4NA03_PM25_NA_",y,"01_",y,"12-RH35-NoNegs.asc/V4NA03_PM25_NA_",y,"01_",y,"12-RH35-NoNegs.asc"))

  #Equate the crs (Note that pm25 has no crs to begin with, so we use the tracts crs)
  crs(pm25) <- crs(Texas_zcta)

  #Coerce PM25 data into a RasterLayer (from SpatialGridDataFrame)
  pm25_r <- as(pm25, "RasterLayer")

  #Extract a spatially weighted average of rasters in each zipcode polygon, and omitt NA values
  #na.rm = T will omit NA values but all Texas zipcodes will still get PM25 data
  newyear <- raster::extract(pm25_r, Texas_zcta, fun = mean, exact = TRUE, weights = TRUE, normalizeWeights = TRUE, df = TRUE, na.rm = T)

  #Rename the pollution variable and drop the id
  colnames(newyear)[2] <- paste0("PM25_",y)
  newyear <- newyear %>%
    dplyr::select(-c(ID))

  #Combine with the other years of pollution data to form a wide-format data frame
  texas_pm25 <- cbind(texas_pm25, newyear)
}


#Save the wide-format data frame as a dta
save.dta13(texas_pm25, "processed_data/state48_pm25_2000_2018.dta")

# Save mean pm 2.5
# Load the dataset
data <- read_dta("processed_data/state48_pm25_2000_2018.dta")

# Compute row-wise mean PM2.5 from 2000 to 2018
data <- data %>%
  mutate(mean_pm25 = rowMeans(dplyr::select(., starts_with("PM25")), na.rm = TRUE)) %>%
  filter(!is.na(mean_pm25))


# Collapse to ZIP-level
collapsed_data <- data %>%
  group_by(ZCTA5CE10) %>%
  summarise(mean_pm25 = mean(mean_pm25, na.rm = TRUE))


# Save the collapsed data to a CSV file
write.csv(collapsed_data, "processed_data/mean_pm25.csv", row.names = FALSE)